pacman::p_load(zoo,dplyr,readxl,stringr,stringi,tidyr,readr)
rm(list=ls())
################################################################################

###################################################### Brazil

################################ Establishment size group (ha)

#################### 1995

###### Size (ha) by activity group

### Number of establishments 
df <- read_excel('../../data/size/raw/tabela312/tabela312_seasonallivestock.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c('county_id', 'county', 'n1', 'total', 'total',
                  '0_1_ha', '0_1_ha','1_2_ha', '1_2_ha','2_5_ha', '2_5_ha','5_10_ha', '5_10_ha',
                  '10_20_ha', '10_20_ha','20_50_ha', '20_50_ha','50_100_ha', '50_100_ha',
                  '100_200_ha', '100_200_ha','200_500_ha', '200_500_ha','500_1000_ha', '500_1000_ha',
                  '1000_2000_ha', '1000_2000_ha','2000_5000_ha', '2000_5000_ha',
                  '5000_10000_ha', '5000_10000_ha','10000_100000_ha', '10000_100000_ha',
                  '100000_ha', '100000_ha','none', 'none')
colnames(df) <- make.unique(colnames(df))
df <- df[, -(2:3)]
df <- df[-(1:4), ]
df <- df %>% mutate(across(everything(), ~str_replace_all(., c(
  'X' = '0','-' = '0','Lavoura temporária' = 'crops_seasonal','Pecuária' = 'livestock' ))))
df_w <- df
df_1 <- df %>% dplyr::select(where(~ any(. == 'crops_seasonal',na.rm = TRUE))) %>%
  slice(-1) %>%   mutate(group_item = 'crops_seasonal') %>% mutate(county_id = df_w$county_id[1:n()])
df_2 <- df_w %>% dplyr::select(where(~ any(. == 'livestock', na.rm = TRUE))) %>% slice(-1) 
names(df_2) <- gsub("\\.1$", "", names(df_2))
df_2 <- df_2 %>% mutate(group_item = 'livestock', county_id = df_w$county_id[1:(n())])
df_e_sl <- bind_rows(df_1, df_2)

df <- read_excel('../../data/size/raw/tabela312/tabela312_total.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c(
  'county_id', 'county', 'n1', 'total',
  '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha',
  '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha',
  '200_500_ha', '500_1000_ha', '1000_2000_ha',
  '2000_5000_ha', '5000_10000_ha', '10000_100000_ha',
  '100000_ha', 'none')
df <- df %>% dplyr::select(-c(2,3))
df <- df[-(1:4), ]
df[df == "X"] <- "0"
df[df == "-"] <- "0"
df[df == "Total"] <- "total"
df_w <- df
df <- df %>% dplyr::select(where(~ any(. == 'total', na.rm = TRUE)))
df <- df %>% slice(-1)
df <- df %>% mutate(group_item = 'total',county_id = df_w$county_id[2:(nrow(df) + 1)])
df_e_t <- df
#Append
df_e <- bind_rows(df_e_t, df_e_sl) %>%  mutate(variable = "establishments") %>% dplyr::select(-none)

### Area (ha)
df <- read_excel('../../data/size/raw/tabela491/tabela491_seasonallivestock.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c('county_id', 'county', 'n1', 'total', 'total',
                  '0_1_ha', '0_1_ha', '1_2_ha', '1_2_ha', '2_5_ha', '2_5_ha', 
                  '5_10_ha', '5_10_ha', '10_20_ha', '10_20_ha', '20_50_ha', '20_50_ha',
                  '50_100_ha', '50_100_ha', '100_200_ha', '100_200_ha', '200_500_ha', '200_500_ha',
                  '500_1000_ha', '500_1000_ha', '1000_2000_ha', '1000_2000_ha', 
                  '2000_5000_ha', '2000_5000_ha', '5000_10000_ha', '5000_10000_ha',
                  '10000_100000_ha', '10000_100000_ha', '100000_ha', '100000_ha', 'none')
colnames(df) <- make.unique(colnames(df))
df <- df %>% dplyr::select(-2, -3)
df <- df[-c(1:4), ]
df <- df %>% mutate(across(everything(), ~str_replace_all(., c('X' = '0', '-' = '0',
                                                               'Lavoura temporária' = 'crops_seasonal', 
                                                               'Pecuária' = 'livestock'))))
df_w <- df
df_1 <- df %>% dplyr::select(where(~ any(. == 'crops_seasonal',na.rm = TRUE))) %>%
  slice(-1) %>%  mutate(group_item = 'crops_seasonal') %>% mutate(county_id = df_w$county_id[-1])
df_2 <- df_w %>% dplyr::select(where(~ any(. == 'livestock', na.rm = TRUE))) %>%  slice(-1) 
names(df_2) <- gsub("\\.1$", "", names(df_2))
df_2 <- df_2 %>% mutate(group_item = 'livestock',county_id = df_w$county_id[-1])
df_a_sl <- bind_rows(df_1, df_2)

df <- read_excel('../../data/size/raw/tabela491/tabela491_total.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c(
  'county_id', 'county', 'n1', 'total',
  '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha', '20_50_ha',
  '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
  '1000_2000_ha', '2000_5000_ha', '5000_10000_ha', 
  '10000_100000_ha', '100000_ha', 'none')

df <- df %>% dplyr::select(-2, -3)
df <- df[-c(1:4), ]
df <- df %>% mutate(across(everything(), ~str_replace_all(., c('X' = '0', '-' = '0', 'Total' = 'total'))))
df_w <- df
df_a_t <- df_w %>%  dplyr::select(where(~ any(. == 'total', na.rm = TRUE))) %>% slice(-1) %>%
  mutate(group_item = 'total',county_id = df_w$county_id[-1])
df_a_t <- df_a_t %>% dplyr::select(-none)

#Append area data
df_a <- bind_rows(df_a_t, df_a_sl)
df_a <- df_a %>% mutate(variable = 'area_ha')

## Append all group data
df <- bind_rows(df_a, df_e)
df <- df %>% mutate(group_type = 'activity')
df <- df %>% mutate(county_id = paste0('BR', as.character(county_id)))
df_ac <- df %>% pivot_longer(cols = -c(group_type, group_item, variable, county_id),names_to = 'size_bin',values_to = 'value')



###### Size (ha) by land use group

### Number of establishments 
df <- read_excel('../../data/size/raw/tabela311.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df[3:nrow(df), ]
colnames(df) <- df[1, ]
df <- df[-1, ]
colnames(df) <- c('county_id', 'group_item', 'total', '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha',
                  '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
                  '1000_2000_ha', '2000_5000_ha', '5000_10000_ha', '10000_100000_ha', '100000_ha', 'none')
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(across(everything(), ~ str_replace_all(., c(
    'X' = '0',
    '-' = '0',
    'Total' = 'total',
    'Lavouras permanentes' = 'crops_permanent',
    'Lavouras temporárias' = 'crops_seasonal',
    'Lavouras temporárias em descanso' = 'crops_seasonal_fallow',
    'Pastagens naturais' = 'pasture_natural',
    'Pastagens plantadas' = 'pasture_planted',
    'Matas e florestas naturais' = 'forest_natural',
    'Matas e florestas artificiais' = 'forest_planted',
    'Terras produtivas não utilizadas' = 'not_used',
    'Terras inaproveitáveis' = 'not_usable'))))
df <- df %>% mutate(variable = 'establishments')
df <- df[-1, ]
df_e <- df

### Area (ha)
df <- read_excel('../../data/size/raw/tabela316.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df[-(1:3), ]
colnames(df) <- as.character(df[1, ])
df <- df[-1, ]
colnames(df) <- c(
  'county_id', 'group_item', 'total',
  '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha', '20_50_ha', 
  '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha', 
  '1000_2000_ha', '2000_5000_ha', '5000_10000_ha', 
  '10000_100000_ha', '100000_ha', 'none')
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(across(everything(), ~ gsub('X|\\-', '0', .))) %>%
  mutate(across(everything(), ~ gsub('Total', 'total', .))) %>%
  mutate(across(everything(), ~ gsub('Lavouras permanentes', 'crops_permanent', .) )) %>%
  mutate(across(everything(), ~ gsub('Lavouras temporárias', 'crops_seasonal', .) )) %>%
  mutate(across(everything(), ~ gsub('Lavouras temporárias em descanso', 'crops_seasonal_fallow', .))) %>%
  mutate(across(everything(), ~ gsub('Pastagens naturais', 'pasture_natural', .))) %>%
  mutate(across(everything(), ~ gsub('Pastagens plantadas', 'pasture_planted', .))) %>%
  mutate(across(everything(), ~ gsub('Matas e florestas naturais', 'forest_natural', .))) %>%
  mutate(across(everything(), ~ gsub('Matas e florestas artificiais', 'forest_planted', .))) %>%
  mutate(across(everything(), ~ gsub('Terras produtivas não utilizadas', 'not_used', .))) %>%
  mutate(across(everything(), ~ gsub('Terras inaproveitáveis', 'not_usable', .) ))
df <- df %>% mutate(variable = 'area_ha')
df_a <- df

##Merge
df <- bind_rows(df_a, df_e)
df$group_type <- 'land_use'
df_lu <- df %>% pivot_longer(
  cols = c('total', '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha', '20_50_ha',
           '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha', '1000_2000_ha',
           '2000_5000_ha', '5000_10000_ha', '10000_100000_ha', '100000_ha', 'none'),
  names_to = 'size_bin', values_to = 'value')


###### Size (ha) by crop-specific land use

### Number of establishments 
df <- read_excel('../../data/size/raw/tabela492.xlsx', sheet = 1, .name_repair = "unique_quiet")
colnames(df) <- c('county_id', 'n1', 'size_bin', 'maize', 'soybean')
df <- df %>% dplyr::select(-n1)
df <- df[-(1:4), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%  mutate(across(everything(), ~ gsub('X|\\-', '0', .))) %>%
  mutate(size_bin = recode(size_bin,
    'Total'='total',
    'Menos de 1 ha' = '0_1_ha',
    '1 a menos de 2 ha' = '1_2_ha',
    '2 a menos de 5 ha' = '2_5_ha',
    '5 a menos de 10 ha' = '5_10_ha',
    '10 a menos de 20 ha' = '10_20_ha',
    '20 a menos de 50 ha' = '20_50_ha',
    '50 a menos de 100 ha' = '50_100_ha',
    '100 a menos de 200 ha' = '100_200_ha',
    '200 a menos de 500 ha' = '200_500_ha',
    '500 a menos de 1.000 ha' = '500_1000_ha',
    '1.000 a menos de 2.000 ha' = '1000_2000_ha',
    '2.000 a menos de 5.000 ha' = '2000_5000_ha',
    '5.000 a menos de 10.000 ha' = '5000_10000_ha',
    '10.000 a menos de 100.000 ha' = '10000_100000_ha',
    '100.000 ha e mais' = '100000_ha',
    'Sem declaração' = 'none' ))
df <- df %>% mutate(variable = 'establishments')
df_e <- df

### Area (ha)
df <- read_excel('../../data/size/raw/tabela503.xlsx', sheet = 1, .name_repair = "unique_quiet")
colnames(df) <- c('county_id', 'n1', 'size_bin', 'maize', 'soybean')
df <- df %>% dplyr::select(-n1)
df <- df[-(1:4), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(across(everything(), ~ gsub('X|\\-', '0', .))) %>%
  mutate(size_bin = recode(size_bin,
    'Total'='total',
    'Menos de 1 ha' = '0_1_ha',
    '1 a menos de 2 ha' = '1_2_ha',
    '2 a menos de 5 ha' = '2_5_ha',
    '5 a menos de 10 ha' = '5_10_ha',
    '10 a menos de 20 ha' = '10_20_ha',
    '20 a menos de 50 ha' = '20_50_ha',
    '50 a menos de 100 ha' = '50_100_ha',
    '100 a menos de 200 ha' = '100_200_ha',
    '200 a menos de 500 ha' = '200_500_ha',
    '500 a menos de 1.000 ha' = '500_1000_ha',
    '1.000 a menos de 2.000 ha' = '1000_2000_ha',
    '2.000 a menos de 5.000 ha' = '2000_5000_ha',
    '5.000 a menos de 10.000 ha' = '5000_10000_ha',
    '10.000 a menos de 100.000 ha' = '10000_100000_ha',
    '100.000 ha e mais' = '100000_ha',
    'Sem declaração' = 'none'))
df <- df %>% mutate(variable = 'area_ha')
df_a <- df

###### Append all group data

df <- bind_rows(df_a, df_e)
df <- df %>% mutate(group_type = 'crop_type')
df_cr <- df %>% pivot_longer(cols = -c(group_type, size_bin, variable, county_id),
                             names_to = "group_item",values_to = "value")
df_cr <- df_cr %>% dplyr::select(all_of(names(df_ac)))

## Bind all
df <- bind_rows(df_lu, df_ac, df_cr)
df <- df %>% mutate(year = '1995')
df95 <- df %>% filter(size_bin %in% c(
    '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha',
    '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha',
    '500_1000_ha', '1000_2000_ha', '2000_5000_ha',
    '5000_10000_ha', '10000_100000_ha', '100000_ha'))




#################### 2006

###### Size (ha) by activity

### Number of establishments
df <- readxl::read_excel('../../data/size/raw/tabela837/tabela837_nha_totallivestock.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df[-c(1:3), ]
colnames(df) <- as.character(df[1, ]) 
colnames(df) <- c('county_id', 'none', 'size_bin', 'total', 'livestock')
df <- df[-1, ]    
df <- df %>% dplyr::select(-none)
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~ gsub('X|\\-', 0, .))) %>%
  mutate(across(everything(), ~ gsub('Total', 'total', .))) %>%
  mutate(across(everything(), ~ gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2.500 ha e mais', '2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('Produtor sem área', 'none', .)))
df_e_tl <- df %>% pivot_longer(cols = c('total', 'livestock'), names_to = 'group_item', values_to = 'value')
df_e_tl <- df_e_tl[-c(1, 2), ]

df <- read_excel('../../data/size/raw/tabela837/tabela837_n_seasonal.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ] 
colnames(df) <- c('county_id', 'n1', 'n2', 'size_bin', 'crops_seasonal')

df <- df[-c(1:5), ]     
df <- df %>% dplyr::select(-n1, -n2) 
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~ gsub('X|\\-', 0, .))) %>%
  mutate(across(everything(), ~ gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2.500 ha e mais', '2500_ha', .)))
df_e_s <- df %>% pivot_longer(cols = -c(county_id, size_bin),
                              names_to = "group_item",values_to = "value")
#Append establishment data
df_e <- bind_rows(df_e_tl, df_e_s) %>% mutate(variable = 'establishments')


### Area (ha)
df <- read_excel('../../data/size/raw/tabela837/tabela837_nha_totallivestock.xlsx', sheet = 2,skip=4, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]        
colnames(df) <- as.character(df[1, ])
df <- df[-1, ]              
colnames(df) <- c('county_id', 'none', 'size_bin', 'total', 'livestock')
df <- df %>% dplyr::select(-none)
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~ gsub('X|\\-', 0, .))) %>%
  mutate(across(everything(), ~ gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2.500 ha e mais', '2500_ha', .)))
df_a_tl <- df %>%  pivot_longer(cols = c('total', 'livestock'),
                                names_to = "group_item",values_to = "value")

df <- read_excel("../../data/size/raw/tabela837/tabela837_ha_seasonal.xlsx", sheet = 1,skip=4, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]        
colnames(df) <- as.character(df[1, ])
df <- df[-1, ]  
colnames(df) <- c('county_id', 'n1', 'n2', 'size_bin', 'crops_seasonal')
df <- df %>% dplyr::select(-n1, -n2)
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~ gsub('X|-', '0', .))) %>%
  mutate(across(everything(), ~ gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2.500 ha e mais', '2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('Produtor sem área', 'none', .)))

df_a_s <- df %>% pivot_longer(cols = -c(county_id, size_bin),names_to = "group_item",values_to = "value" )

#Append area data
df_a <- bind_rows(df_a_tl, df_a_s) %>% mutate(variable = "area_ha")

## Append all group data
df <- bind_rows(df_a, df_e) %>% mutate(group_type = "activity")
df_ac <- df %>% dplyr::select(group_type, group_item, variable, county_id, size_bin, value)


###### Size (ha) by land use group

df <- read_excel('../../data/size/raw/tabela854.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c('county_id', 'size_bin',
                  'crops_permanent', 'crops_seasonal', 'crops_forage', 'floriculture',
                  'pasture_natural', 'pasture_planted_degraded', 'pasture_planted',
                  'forest_natural_protected', 'forest_natural_unprotected', 'forest_planted', 'agroforestry',
                  'aquaculture', 'construction', 'degraded_land', 'not_usable')
df <- df[-c(1:6), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 ha e mais', '2500_ha', .)))

df <- df %>% mutate(variable = "establishments")
df_e <- df


### Area (ha)
df <- read_excel('../../data/size/raw/tabela854.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c('county_id', 'size_bin',
                  'crops_permanent', 'crops_seasonal', 'crops_forage', 'floriculture',
                  'pasture_natural', 'pasture_planted_degraded', 'pasture_planted',
                  'forest_natural_protected', 'forest_natural_unprotected', 'forest_planted', 'agroforestry',
                  'aquaculture', 'construction', 'degraded_land', 'not_usable')
df <- df[-c(1:6), ]
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(county_id = ifelse(county_id == "BRNA", lag(county_id), county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 ha e mais', '2500_ha', .)))
df <- df %>% mutate(variable = "area_ha")
df_a <- df

## Append all group data
df <- bind_rows(df_e, df_a)
df <- df %>% mutate(group_type = "land_use")
df_lu <- df %>%  pivot_longer(cols = -c(group_type, size_bin, variable, county_id),
    names_to = "group_item",values_to = "value" )
df_lu <- df_lu %>% dplyr::select(group_type, group_item, variable, county_id, size_bin, value)


###### Size (ha) by crop-specific land use

### Number of establishments 
df <- read_excel('../../data/size/raw/tabela822.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c('county_id', 'size_bin', 'maize1', 'maize2', 'maize3', 'soybean1', 'soybean2', 'soybean3')
df <- df %>% mutate(maize = maize1,soybean = soybean1 ) %>% dplyr::select(county_id, size_bin, maize, soybean)
df <- df[-c(1:6), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 ha e mais', '2500_ha', .)))
df <- df %>%  mutate(variable = 'establishments')
df_e <- df


### Area (ha)
df <- read_excel('../../data/size/raw/tabela822.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c('county_id', 'size_bin', 'maize1', 'maize2', 'maize3', 'soybean1', 'soybean2', 'soybean3')
df <- df %>% mutate(maize = maize1, soybean = soybean1) %>% dplyr::select(county_id, size_bin, maize, soybean)
df <- df[-c(1:6), ]
df <- df %>%  mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 ha e mais', '2500_ha', .)))
df <- df %>% mutate(variable = 'area_ha')
df_a <- df

## Append all group data
df_cr <- bind_rows(df_e, df_a) %>% mutate(group_type = 'crop_type') %>%
  pivot_longer(cols = -c(group_type, size_bin, variable, county_id),
               names_to = 'group_item',values_to = 'value' ) %>%
  dplyr::select(all_of(colnames(df_ac)))

## Bind all
df06 <- bind_rows(df_lu, df_ac, df_cr) %>% mutate(year = '2006') %>%
  filter(size_bin %in% c('0_01_ha', '01_02_ha', '02_05_ha', '05_1_ha', '1_2_ha', '2_3_ha', '3_4_ha', '4_5_ha',
                         '5_10_ha', '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
                         '1000_2500_ha', '2500_ha'))


#################### 2017

###### Size (ha) by activity

### Number of establishments 
df <- read_excel("../../data/size/raw/tabela6880/tabela6880_n_total.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2))
colnames(df) <- c('county_id', 'none', 'size_bin', 'value')
df <- df %>% dplyr::select(-none)
df <- df[-c(1:5), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")

df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))
df <- df %>% mutate(group_item = "total")
df_e_t <- df

df <- read_excel("../../data/size/raw/tabela6880/tabela6880_n_seasonal.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2))
colnames(df) <- c('county_id', 'none', 'size_bin', 'value')
df <- df %>% dplyr::select(-none)
df <- df[-c(1:5), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))
df <- df %>% mutate(group_item = "crops_seasonal")
df_e_s <- df

df <- read_excel("../../data/size/raw/tabela6880/tabela6880_n_livestock.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2))
colnames(df) <- c('county_id', 'none', 'size_bin', 'value')
df <- df %>% dplyr::select(-none)
df <- df[-c(1:5), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id) ) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))

df <- df %>% mutate(group_item = "livestock")
df_e_l <- df

#Append all establishment data
df_e <- bind_rows(df_e_t,df_e_s,df_e_l)
df_e <- df_e %>% mutate(variable = 'establishments')

 
### Area (ha)
df <- read_excel("../../data/size/raw/tabela6880/tabela6880_ha_total.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2))
colnames(df) <- c('county_id', 'none', 'size_bin', 'value')
df <- df %>% dplyr::select(-none)
df <- df[-c(1:5), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))
df <- df %>% mutate(group_item = "total")
df_a_t <- df

df <- read_excel("../../data/size/raw/tabela6880/tabela6880_ha_seasonal.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2))
colnames(df) <- c('county_id', 'none', 'size_bin', 'value')
df <- df %>% dplyr::select(-none)
df <- df[-c(1:5), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))
df <- df %>% mutate(group_item = "crops_seasonal")
df_a_s <- df

df <- read_excel("../../data/size/raw/tabela6880/tabela6880_ha_livestock.xlsx", sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2))
colnames(df) <- c('county_id', 'none', 'size_bin', 'value')
df <- df %>% dplyr::select(-none)
df <- df[-c(1:5), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))

df <- df %>% mutate(group_item = "livestock")
df_a_l <- df

#Append all establishment data
df_a <- bind_rows(df_a_t,df_a_s,df_a_l)
df_a <- df_a %>% mutate(variable = 'area_ha')

## Append all group data
df <- bind_rows(df_a, df_e)
df <- df %>% mutate(group_type = "activity")
df_ac <- df %>% dplyr::select(group_type, group_item, variable, county_id, size_bin, value)


###### Size (ha) by land use group

### Number of establishments 
df <- read_excel('../../data/size/raw/tabela6882.xlsx', sheet = 1,skip=4, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- as.character(df[2, ])
df <- df[-c(1:2), ]
colnames(df) <- c('county_id', 'none', 'size_bin', 'total',
                  'crops_permanent', 'crops_seasonal', 'crops_flowers',
                  'pasture_natural', 'pasture_planted', 'pasture_planted_degraded',
                  'forest_natural_protected', 'forest_natural_unprotected',
                  'forest_planted', 'agroforestry', 'other_uses')
df <- df %>% dplyr::select(-none)
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down") %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~ gsub('X|-', '0', .))) %>%
  mutate(across(everything(), ~ gsub('Total', 'total', .))) %>%
  mutate(across(everything(), ~ gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10.000 ha e mais', '10000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('Produtor sem área', 'none', .)))
df <- df %>% mutate(variable = "establishments")
df_e <- df


### Area (ha)
df <- read_excel('../../data/size/raw/tabela6882.xlsx', sheet = 2,skip=4, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- as.character(df[2, ])
df <- df[-c(1:2), ]
colnames(df) <- c('county_id', 'none', 'size_bin', 'total',
                  'crops_permanent', 'crops_seasonal', 'crops_flowers',
                  'pasture_natural', 'pasture_planted', 'pasture_planted_degraded',
                  'forest_natural_protected', 'forest_natural_unprotected',
                  'forest_planted', 'agroforestry', 'other_uses')
df <- df %>% dplyr::select(-none)
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down") %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~ gsub('X|-', '0', .))) %>%
  mutate(across(everything(), ~ gsub('Total', 'total', .))) %>%
  mutate(across(everything(), ~ gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10.000 ha e mais', '10000_ha', .))) %>%
  mutate(across(everything(), ~ gsub('Produtor sem área', 'none', .)))
df <- df %>% mutate(variable = "area_ha")
df_a <- df

## Append all group data
df <- bind_rows(df_a, df_e)
df <- df %>% mutate(group_type = "land_use")
df_lu <- df %>% pivot_longer(cols = -c(group_type, size_bin, variable, county_id),
    names_to = "group_item",values_to = "value")
df_lu <- df_lu %>%dplyr::select(all_of(colnames(df_ac)))


###### Size (ha) by crop-specific land use

### Number of establishments 
df <- read_excel('../../data/size/raw/tabela6959.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df[-(1:5), ]
colnames(df) <- c('county_id', 'size_bin', 'maize', 'soybean')
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down") %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X|\\-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))
df <- df %>% mutate(variable = "establishments")
df_e <- df

### Area (ha)
df <- read_excel('../../data/size/raw/tabela6959.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df[-(1:5), ]
colnames(df) <- c('county_id', 'size_bin', 'maize', 'soybean')
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down") %>% mutate(county_id = paste0("BR", county_id))
df <- df %>%
  mutate(across(everything(), ~gsub('X|\\-', '0', .))) %>%
  mutate(across(everything(), ~gsub('Mais de 0 a menos de 0,1 ha', '0_01_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,1 a menos de 0,2 ha', '01_02_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,2 a menos de 0,5 ha', '02_05_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 0,5 a menos de 1 ha', '05_1_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2 a menos de 3 ha', '2_3_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 3 a menos de 4 ha', '3_4_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 4 a menos de 5 ha', '4_5_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 500 a menos de 1.000 ha', '500_1000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 1.000 a menos de 2.500 ha', '1000_2500_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 2.500 a menos de 10.000 ha', '2500_10000_ha', .))) %>%
  mutate(across(everything(), ~gsub('De 10.000 ha e mais', '10000_ha', .)))
df <- df %>% mutate(variable = "area_ha")
df_a <- df

## Append all group data
df <- bind_rows(df_e, df_a)
df <- df %>% mutate(group_type = 'crop_type')
df_cr <- df %>% pivot_longer(cols = -c(group_type, size_bin, variable, county_id),names_to = "group_item", values_to = "value")
df_cr <- df_cr %>% dplyr::select(all_of(colnames(df_ac)))


### Bind all
df <- bind_rows(df_lu, df_ac, df_cr)
df <- df %>% mutate(year = '2017')
df17 <- df %>% filter(size_bin %in% c(
    '0_01_ha', '01_02_ha', '02_05_ha', '05_1_ha', '1_2_ha', '2_3_ha', '3_4_ha', '4_5_ha',
    '5_10_ha', '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
    '1000_2500_ha', '2500_10000_ha', '10000_ha'))

###Append all years
df <- bind_rows(df95, df06, df17) %>% mutate(size_bin_type = 'establishment_size_ha') %>%
  dplyr::select(group_type, group_item, variable, county_id, size_bin, value, year, size_bin_type)
write_csv(df, file = "../../data/size/clean/brazil_size_distribution_land.csv.gz")




################################ Livestock size groups (ha, herd size)

#################### 1995

###### Size (ha) by activity

### Cattle stock 
df <- read_excel('../../data/size/raw/tabela323.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-c(2,3))
colnames(df) <- c('county_id', 'total', '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha',
                  '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha', '1000_2000_ha',
                  '2000_5000_ha', '5000_10000_ha', '10000_100000_ha', '100000_ha', 'none')
df <- df[5:nrow(df), ]
df <- df %>% mutate(county_id = paste0('BR', county_id)) %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .))) 
df <- df %>% mutate(group_type = 'activity', group_item = 'total', variable = 'head') %>%
  pivot_longer(cols = c('total', '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha',
                        '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
                        '1000_2000_ha', '2000_5000_ha', '5000_10000_ha', '10000_100000_ha','100000_ha', 'none'),
               names_to = "size_bin", values_to = "value") %>%
  mutate(year = 1995,size_bin_type = 'establishment_size_ha') 
df95 <- df %>% filter(size_bin %in% c('0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha', 
                         '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', 
                         '500_1000_ha', '1000_2000_ha', '2000_5000_ha', 
                         '5000_10000_ha', '10000_100000_ha', '100000_ha'))

#################### 2006

###### Size (ha) by activity

### Number of establishments
df <- read_excel('../../data/size/raw/tabela922/tabela922_estabsize.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-2)
colnames(df) <- c('county_id', 'size_bin', 'total', 'livestock')
df <- df[7:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id)) %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .)))
df <- df %>% mutate(variable = 'establishments') %>%
  mutate(across(size_bin, ~recode(., 
                                  'Total' = 'total',
                                  'Mais de 0 a menos de 0,1 ha' = '0_01_ha',
                                  'De 0,1 a menos de 0,2 ha' = '01_02_ha',
                                  'De 0,2 a menos de 0,5 ha' = '02_05_ha',
                                  'De 0,5 a menos de 1 ha' = '05_1_ha',
                                  'De 1 a menos de 2 ha' = '1_2_ha',
                                  'De 2 a menos de 3 ha' = '2_3_ha',
                                  'De 3 a menos de 4 ha' = '3_4_ha',
                                  'De 4 a menos de 5 ha' = '4_5_ha',
                                  'De 5 a menos de 10 ha' = '5_10_ha',
                                  'De 10 a menos de 20 ha' = '10_20_ha',
                                  'De 20 a menos de 50 ha' = '20_50_ha',
                                  'De 50 a menos de 100 ha' = '50_100_ha',
                                  'De 100 a menos de 200 ha' = '100_200_ha',
                                  'De 200 a menos de 500 ha' = '200_500_ha',
                                  'De 500 a menos de 1.000 ha' = '500_1000_ha',
                                  'De 1.000 a menos de 2.500 ha' = '1000_2500_ha',
                                  'De 2.500 ha e mais'='2500_ha',
                                  'Produtor sem área' = 'none')))
df_e <- df

### Cattle stock 
df <- read_excel('../../data/size/raw/tabela922/tabela922_estabsize.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-2)
colnames(df) <- c('county_id', 'size_bin', 'total', 'livestock')
df <- df[7:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id)) %>% mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .)))
df <- df %>% mutate(variable = 'head') %>%
  mutate(across(size_bin, ~recode(., 
                                  'Total' = 'total',
                                  'Mais de 0 a menos de 0,1 ha' = '0_01_ha',
                                  'De 0,1 a menos de 0,2 ha' = '01_02_ha',
                                  'De 0,2 a menos de 0,5 ha' = '02_05_ha',
                                  'De 0,5 a menos de 1 ha' = '05_1_ha',
                                  'De 1 a menos de 2 ha' = '1_2_ha',
                                  'De 2 a menos de 3 ha' = '2_3_ha',
                                  'De 3 a menos de 4 ha' = '3_4_ha',
                                  'De 4 a menos de 5 ha' = '4_5_ha',
                                  'De 5 a menos de 10 ha' = '5_10_ha',
                                  'De 10 a menos de 20 ha' = '10_20_ha',
                                  'De 20 a menos de 50 ha' = '20_50_ha',
                                  'De 50 a menos de 100 ha' = '50_100_ha',
                                  'De 100 a menos de 200 ha' = '100_200_ha',
                                  'De 200 a menos de 500 ha' = '200_500_ha',
                                  'De 500 a menos de 1.000 ha' = '500_1000_ha',
                                  'De 1.000 a menos de 2.500 ha' = '1000_2500_ha',
                                  'De 2.500 ha e mais'='2500_ha',
                                  'Produtor sem área' = 'none')))
df_c <- df


## Append all group data
df <- bind_rows(df_e, df_c)
df <- df %>% mutate(group_type = 'activity') %>%
  pivot_longer( cols = -c(group_type, size_bin, variable, county_id),
    names_to = 'group_item',values_to = 'value') %>%
  mutate(size_bin_type = 'establishment_size_ha', year = 2006) %>%
  filter(size_bin %in% c('0_01_ha', '01_02_ha', '02_05_ha', '05_1_ha', '1_2_ha', '2_3_ha', '3_4_ha', '4_5_ha',
    '5_10_ha', '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
    '1000_2500_ha', '2500_ha'))
df_esh <- df %>% dplyr::select(all_of(colnames(df95)))


###### Pasture size (ha)

### Number of establishments
df <- read_excel('../../data/size/raw/tabela922/tabela922_pasture.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c(
  'county_id', 'size_bin', 
  '1_2_h', '1_2_h',
  '3_4_h', '3_4_h',
  '5_9_h', '5_9_h',
  '10_19_h', '10_19_h',
  '20_49_h', '20_49_h',
  '50_99_h', '50_99_h',
  '100_199_h', '100_199_h',
  '200_499_h', '200_499_h',
  '500_h', '500_h')
colnames(df) <- make.unique(colnames(df))
df <- df[-(1:5), ]
df <- df %>%
  mutate(across(everything(), ~ gsub('X', '0', .))) %>%
  mutate(across(everything(), ~ gsub('-', '0', .))) %>%
  mutate(across(everything(), ~ gsub('Total', 'total', .))) %>%
  mutate(across(everything(), ~ gsub('Pecuária e criação de outros animais', 'livestock', .))) %>%
  mutate(across(everything(), ~ gsub('Maior que 0 e menor que 1 ha', '0_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 5 ha', '2_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 e mais ha', '500_ha', .)))
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df_w <- df

cols_keep <- which(df[1, ] == "total")
df <- df[, c(1, 2, cols_keep)]
df <- df[-1, ]
df <- df %>% mutate(group_item = 'total')

df_1 <- df
cols_keep <- which(df_w[1, ] == "livestock")
df_w <- df_w[, c(1, 2, cols_keep)]
df_w <- df_w[-1, ]
df_w <- df_w %>% mutate(group_item = 'livestock')

df_2 <- df_w
names(df_2) <- gsub("\\.1$", "", names(df_2))
df <- bind_rows(df_1, df_2) %>% mutate( value = rowSums(across(c(`1_2_h`, `3_4_h`, `5_9_h`, `10_19_h`, `20_49_h`,
                           `50_99_h`, `100_199_h`, `200_499_h`, `500_h`), ~ as.numeric(.)), na.rm = TRUE),
                           variable = 'establishments')
df_e <- df


### Cattle stock
df <- read_excel('../../data/size/raw/tabela922/tabela922_pasture.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c(
  'county_id', 'size_bin', 
  '1_2_h', '1_2_h',
  '3_4_h', '3_4_h',
  '5_9_h', '5_9_h',
  '10_19_h', '10_19_h',
  '20_49_h', '20_49_h',
  '50_99_h', '50_99_h',
  '100_199_h', '100_199_h',
  '200_499_h', '200_499_h',
  '500_h', '500_h'
)
colnames(df) <- make.unique(colnames(df))
df <- df[-(1:5), ]
df <- df %>%
  mutate(across(everything(), ~ gsub('X', '0', .))) %>%
  mutate(across(everything(), ~ gsub('-', '0', .))) %>%
  mutate(across(everything(), ~ gsub('Total', 'total', .))) %>%
  mutate(across(everything(), ~ gsub('Pecuária e criação de outros animais', 'livestock', .))) %>%
  mutate(across(everything(), ~ gsub('Maior que 0 e menor que 1 ha', '0_1_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 1 a menos de 2 ha', '1_2_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 2 a menos de 5 ha', '2_5_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 5 a menos de 10 ha', '5_10_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 10 a menos de 20 ha', '10_20_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 20 a menos de 50 ha', '20_50_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 50 a menos de 100 ha', '50_100_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 100 a menos de 200 ha', '100_200_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 200 a menos de 500 ha', '200_500_ha', .))) %>%
  mutate(across(everything(), ~ gsub('De 500 e mais ha', '500_ha', .)))

df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df_w <- df
cols_keep <- which(df[1, ] == "total")
df <- df[, c(1, 2, cols_keep)]
df <- df[-1, ]
df <- df %>% mutate(group_item = 'total')
df_1 <- df
cols_keep <- which(df_w[1, ] == "livestock")
df_w <- df_w[, c(1, 2, cols_keep)]
df_w <- df_w[-1, ]
df_w <- df_w %>% mutate(group_item = 'livestock')
df_2 <- df_w
names(df_2) <- gsub("\\.1$", "", names(df_2))
df <- bind_rows(df_1, df_2) %>%
  mutate( value = rowSums(across(c(`1_2_h`, `3_4_h`, `5_9_h`, `10_19_h`, `20_49_h`,
                                   `50_99_h`, `100_199_h`, `200_499_h`, `500_h`), 
                                 ~ as.numeric(.)), na.rm = TRUE),variable = 'head')
df_c <- df

## Append all group data
df <- bind_rows(df_e, df_c) %>% mutate(group_type = 'activity',size_bin_type = 'pasture_ha',year = 2006)
df_p <- df %>% dplyr::select(all_of(colnames(df95)))

### Herd size (head)

df_h <- df %>%
  dplyr::select(-value, -size_bin, -size_bin_type) %>%
  mutate(across(-c(year, group_type, group_item, variable, county_id), as.numeric)) %>%
  group_by(year, group_type, group_item, variable, county_id) %>%
  summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>% ungroup() %>%
  pivot_longer(
    cols = -c(year, group_type, group_item, variable, county_id),
    names_to = "size_bin",
    values_to = "value"
  ) %>%
  mutate(size_bin_type = "cattle_head") %>%
  dplyr::select(all_of(colnames(df95)))

## Append all year data
df06 <- bind_rows(
  df_esh %>% mutate(value = as.numeric(value)),
  df_p %>% mutate(value = as.numeric(value)),
  df_h %>% mutate(value = as.numeric(value)) )



#################### 2017

###### Size (ha) by activity

### Number of establishments
df <- read_excel('../../data/size/raw/tabela6908.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-1)
colnames(df) <- c('county_id', 'size_bin', 'total', 'livestock')
df <- df[6:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id)) %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>% mutate(across(everything(), ~gsub('-', '0', .)))
df <- df %>% mutate(variable = 'establishments') %>%
  mutate(across(size_bin, ~recode(., 
                                  'Total' = 'total',
                                  'Mais de 0 a menos de 0,1 ha' = '0_01_ha',
                                  'De 0,1 a menos de 0,2 ha' = '01_02_ha',
                                  'De 0,2 a menos de 0,5 ha' = '02_05_ha',
                                  'De 0,5 a menos de 1 ha' = '05_1_ha',
                                  'De 1 a menos de 2 ha' = '1_2_ha',
                                  'De 2 a menos de 3 ha' = '2_3_ha',
                                  'De 3 a menos de 4 ha' = '3_4_ha',
                                  'De 4 a menos de 5 ha' = '4_5_ha',
                                  'De 5 a menos de 10 ha' = '5_10_ha',
                                  'De 10 a menos de 20 ha' = '10_20_ha',
                                  'De 20 a menos de 50 ha' = '20_50_ha',
                                  'De 50 a menos de 100 ha' = '50_100_ha',
                                  'De 100 a menos de 200 ha' = '100_200_ha',
                                  'De 200 a menos de 500 ha' = '200_500_ha',
                                  'De 500 a menos de 1.000 ha' = '500_1000_ha',
                                  'De 1.000 a menos de 2.500 ha' = '1000_2500_ha',
                                  'De 2.500 ha e mais'='2500_ha',
                                  'De 2.500 a menos de 10.000 ha' = '2500_10000_ha',
                                  'De 10.000 ha e mais' = '10000_ha')))
df_e <- df


### Cattle stock
df <- read_excel('../../data/size/raw/tabela6908.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-1)
colnames(df) <- c('county_id', 'size_bin', 'total', 'livestock')
df <- df[6:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id)) %>%
  mutate(across(everything(), ~gsub('X', '0', .))) %>%
  mutate(across(everything(), ~gsub('-', '0', .)))
df <- df %>% mutate(variable = 'head') %>%
  mutate(across(size_bin, ~recode(., 
                                  'Total' = 'total',
                                  'Mais de 0 a menos de 0,1 ha' = '0_01_ha',
                                  'De 0,1 a menos de 0,2 ha' = '01_02_ha',
                                  'De 0,2 a menos de 0,5 ha' = '02_05_ha',
                                  'De 0,5 a menos de 1 ha' = '05_1_ha',
                                  'De 1 a menos de 2 ha' = '1_2_ha',
                                  'De 2 a menos de 3 ha' = '2_3_ha',
                                  'De 3 a menos de 4 ha' = '3_4_ha',
                                  'De 4 a menos de 5 ha' = '4_5_ha',
                                  'De 5 a menos de 10 ha' = '5_10_ha',
                                  'De 10 a menos de 20 ha' = '10_20_ha',
                                  'De 20 a menos de 50 ha' = '20_50_ha',
                                  'De 50 a menos de 100 ha' = '50_100_ha',
                                  'De 100 a menos de 200 ha' = '100_200_ha',
                                  'De 200 a menos de 500 ha' = '200_500_ha',
                                  'De 500 a menos de 1.000 ha' = '500_1000_ha',
                                  'De 1.000 a menos de 2.500 ha' = '1000_2500_ha',
                                  'De 2.500 ha e mais'='2500_ha',
                                  'De 2.500 a menos de 10.000 ha' = '2500_10000_ha',
                                  'De 10.000 ha e mais' = '10000_ha' )))
df_c <- df

## Append all group data
df <- bind_rows(df_e, df_c)
df <- df %>%
  mutate(group_type = 'activity') %>%
  pivot_longer(cols = -c(group_type, size_bin, variable, county_id),
               names_to = 'group_item', values_to = 'value') %>%
  mutate(size_bin_type = 'establishment_size_ha', year = 2017) %>%
  filter(size_bin %in% c(
    '0_01_ha', '01_02_ha', '02_05_ha', '05_1_ha', '1_2_ha', '2_3_ha', '3_4_ha', '4_5_ha',
    '5_10_ha', '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_1000_ha',
    '1000_2500_ha', '2500_ha'))
df_esh <- df %>% dplyr::select(all_of(colnames(df95)))


###### Pasture size (ha)

### Number of establishments
df <- read_excel('../../data/size/raw/tabela6911/tabela6911_pasture.xlsx', sheet = 1, .name_repair = "unique_quiet") %>%
  slice(-n()) %>% dplyr::select(-2) %>%     
  mutate(across(everything(), ~replace(., . %in% c('X', '-', ''), 0))) %>% 
  setNames(c('county_id', '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_ha')) %>%
  slice(-1:-5) %>%    
  pivot_longer(cols = -county_id,names_to = "size_bin",values_to = "value") %>%
  mutate(group_type = "activity",group_item = "total",variable = "establishments")
df_e <- df

### Cattle stocks
df <- read_excel('../../data/size/raw/tabela6911/tabela6911_pasture.xlsx', sheet = 2, .name_repair = "unique_quiet") %>%
  slice(-n()) %>% dplyr::select(-2) %>%     
  mutate(across(everything(), ~replace(., . %in% c('X', '-', ''), 0))) %>% 
  setNames(c('county_id', '0_1_ha', '1_2_ha', '2_5_ha', '5_10_ha', '10_20_ha', '20_50_ha', '50_100_ha', '100_200_ha', '200_500_ha', '500_ha')) %>%
  slice(-1:-5) %>% pivot_longer(cols = -county_id,names_to = "size_bin",values_to = "value") %>%
  mutate(group_type = "activity",group_item = "total",variable = "head")
df_c <- df

## Append all group data
df <- bind_rows(df_e, df_c) %>% mutate(county_id = paste0('BR', as.character(county_id)),year = 2017,
                                       size_bin_type = 'pasture_ha') %>% dplyr::select(all_of(colnames(df95)))
df_p <- df


###### Herd size (head)

### Number of establishments
df <- read_excel('../../data/size/raw/tabela6910.xlsx', sheet = 1, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c('county_id', 'size_bin', 'group_item', 'value')
df <- df[6:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(size_bin = ifelse(size_bin == "NA" | is.na(size_bin), NA, size_bin)) %>%
  fill(size_bin, .direction = "down") %>% mutate(size_bin = paste0("BR", size_bin))
df <- df %>% mutate(across(everything(), ~ case_when(
    . == "X" ~ "0",
    . == "-" ~ "0",
    . == "Total" ~ "total",
    . == "Pecuária e criação de outros animais" ~ "livestock",
    . == "BRDe 1 a 2" ~ "1_2_h",
    . == "BRDe 3 a 5" ~ "3_5_h",
    . == "BRDe 6 a 10" ~ "6_10_h",
    . == "BRDe 11 a 20" ~ "11_20_h",
    . == "BRDe 21 a 50" ~ "21_50_h",
    . == "BRDe 51 a 100" ~ "51_100_h",
    . == "BRDe 101 a 200" ~ "101_200_h",
    . == "BRDe 201 a 500" ~ "201_500_h",
    . == "BRDe 501 a 1000" ~ "501_1000_h",
    . == "BRDe 1001 e mais" ~ "1001_h",
    TRUE ~ as.character(.))))
df <- df %>% mutate(variable = 'establishments')
df_e <- df

### Cattle stocks
df <- read_excel('../../data/size/raw/tabela6910.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
colnames(df) <- c('county_id', 'size_bin', 'group_item', 'value')
df <- df[6:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(size_bin = ifelse(size_bin == "NA" | is.na(size_bin), NA, size_bin)) %>%
  fill(size_bin, .direction = "down") %>% mutate(size_bin = paste0("BR", size_bin))
df <- df %>% mutate(across(everything(), ~ case_when(
    . == "X" ~ "0",
    . == "-" ~ "0",
    . == "Total" ~ "total",
    . == "Pecuária e criação de outros animais" ~ "livestock",
    . == "BRDe 1 a 2" ~ "1_2_h",
    . == "BRDe 3 a 5" ~ "3_5_h",
    . == "BRDe 6 a 10" ~ "6_10_h",
    . == "BRDe 11 a 20" ~ "11_20_h",
    . == "BRDe 21 a 50" ~ "21_50_h",
    . == "BRDe 51 a 100" ~ "51_100_h",
    . == "BRDe 101 a 200" ~ "101_200_h",
    . == "BRDe 201 a 500" ~ "201_500_h",
    . == "BRDe 501 a 1000" ~ "501_1000_h",
    . == "BRDe 1001 e mais" ~ "1001_h",
    TRUE ~ as.character(.) )))
df <- df %>% mutate(variable = 'head')
df_c <- df

## Append all group data
df <- bind_rows(df_e, df_c) %>% mutate(year = 2017, group_type = "activity", size_bin_type = "cattle_head") %>% 
  dplyr::select(all_of(colnames(df95)))
df_h <- df

## Append all year data
df <- bind_rows(df_esh, df_p, df_h)
df17 <- df

## Append all years
df <- bind_rows(
  df06 %>% mutate(across(everything(), as.character)),
  df17   %>% mutate(across(everything(), as.character)),
  df95   %>% mutate(across(everything(), as.character)) ) %>%
  dplyr::select(group_type, group_item, variable, county_id, size_bin, value, year, size_bin_type)
write_csv(df, "../../data/size/clean/brazil_size_distribution_livestock.csv.gz")



###### Stocking rate data
df <- read_excel('../../data/size/raw/tabela925.xlsx', sheet = 1, .name_repair = "unique_quiet") %>% slice(-n()) %>%  dplyr::select(-4)
colnames(df) <- c('county_id', 'group_item', 'size_bin',
                  'total','corte','corte_cria','corte_recria','corte_engorda','corte_cria_recria','corte_cria_engorda','corte_recria_engorda','corte_cria_recria_engorda',
                  'total','corte','corte_cria','corte_recria','corte_engorda','corte_cria_recria','corte_cria_engorda','corte_recria_engorda','corte_cria_recria_engorda',
                  'total','corte','corte_cria','corte_recria','corte_engorda','corte_cria_recria','corte_cria_engorda','corte_recria_engorda','corte_cria_recria_engorda')
colnames(df) <- make.unique(colnames(df))
df <- df %>% mutate(across(everything(), ~str_replace_all(., c(
    "X" = "0", "-" = "0",
    "Total" = "total", "Pecuária e criação de outros animais" = "livestock",
    "Maior que 0 e menor que 1 ha" = "0_1_ha",
    "De 1 a menos de 2 ha" = "1_2_ha",
    "De 2 a menos de 5 ha" = "2_5_ha",
    "De 5 a menos de 10 ha" = "5_10_ha",
    "De 10 a menos de 20 ha" = "10_20_ha",
    "De 20 a menos de 50 ha" = "20_50_ha",
    "De 50 a menos de 100 ha" = "50_100_ha",
    "De 100 a menos de 200 ha" = "100_200_ha",
    "De 200 a menos de 500 ha" = "200_500_ha",
    "De 500 e mais" = "500_ha"
  ))))
df <- df %>% slice(7:n()) 
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>%
  fill(county_id, .direction = "down") %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(group_item = ifelse(group_item == "NA" | is.na(group_item), NA, group_item)) %>%
  fill(group_item, .direction = "down") %>% mutate(group_item = paste0("BR", group_item)) %>%
  mutate(across(c(group_item), ~str_replace_all(., c(
    "BRtotal" = "total",
    "BRlivestock" = "livestock"))))
df_1 <- df %>% dplyr::select(4:12) %>%
  mutate(county_id = df$county_id, group_item = df$group_item, size_bin = df$size_bin, variable = "0_1_y") %>%
  relocate(county_id, group_item, size_bin, variable)
df_2 <- df %>%
  dplyr::select(13:21) %>%
  mutate(county_id = df$county_id,group_item = df$group_item,size_bin = df$size_bin,variable = "1_2_y") %>%
  relocate(county_id, group_item, size_bin, variable)
colnames(df_2) <- gsub("\\.1", "", colnames(df_2))
df_3 <- df %>%dplyr::select(22:30) %>%
  mutate( county_id = df$county_id,group_item = df$group_item,size_bin = df$size_bin,variable = "2_y") %>%
  relocate(county_id, group_item, size_bin, variable)
colnames(df_3) <- gsub("\\.2", "", colnames(df_3))

df <- bind_rows(df_1, df_2, df_3)
df <- df %>% mutate(group_type = 'activity')
df <- df %>% pivot_longer(cols = -c(county_id, group_type, group_item, variable, size_bin), names_to = "end_use",values_to = "value")
df_c <- df

###### Pasture area
df <- read_excel('../../data/size/raw/tabela1421.xlsx', sheet = 2, .name_repair = "unique_quiet")
df <- df[-nrow(df), ]
df <- df %>% dplyr::select(-3)
colnames(df) <- c('county_id', 'size_bin', 'pasture_natural', 'pasture_planted_degraded', 'pasture_planted', 'agroforestry')
df <- df[6:nrow(df), ]
df <- df %>% mutate(county_id = ifelse(county_id == "NA" | is.na(county_id), NA, county_id)) %>% fill(county_id, .direction = "down")
df <- df %>% mutate(county_id = paste0("BR", county_id))
df <- df %>% mutate(across( everything(),
                            ~ recode(., 
                                     'X' = '0',
                                     '-' = '0',
                                     'Total' = 'total',
                                     'Pecuária e criação de outros animais' = 'livestock',
                                     'Maior que 0 e menor que 1 ha' = '0_1_ha',
                                     'De 1 a menos de 2 ha' = '1_2_ha',
                                     'De 2 a menos de 5 ha' = '2_5_ha',
                                     'De 5 a menos de 10 ha' = '5_10_ha',
                                     'De 10 a menos de 20 ha' = '10_20_ha',
                                     'De 20 a menos de 50 ha' = '20_50_ha',
                                     'De 50 a menos de 100 ha' = '50_100_ha',
                                     'De 100 a menos de 200 ha' = '100_200_ha',
                                     'De 200 a menos de 500 ha' = '200_500_ha',
                                     'De 500 e mais ha' = '500_ha')))
df <- df %>% pivot_longer(cols = -c(county_id, size_bin),names_to = 'group_item',values_to = 'value')
df <- df %>% mutate(group_type = 'land_use',variable = 'area_ha',end_use = 'total' )
df_a <- df %>% dplyr::select(all_of(colnames(df_c)))

## Append all group data
df <- bind_rows(df_a, df_c) %>% mutate(size_bin_type = "pasture_ha",year = 2006 )
write_csv(df, "../../data/size/clean/brazil_stocking_rates.csv.gz")



###################################################### Argentina

#################### 2008

province_list<-c('buenosaires','catamarca','chaco','chubut','cordoba','corrientes','entrerios',
                 'formosa','jujuy','lapampa','larioja','mendoza','misiones','neuquen','rionegro',
                 'salta','sanjuan','sanluis','santacruz','santafe','santiagodelestero',
                 'tierradelfuego','tucuman')
province_name <-c('Buenos Aires','Catamarca','Chaco','Chubut','Cordoba','Corrientes','Entre Rios',
                  'Formosa','Jujuy','La Pampa','La Rioja','Mendoza','Misiones','Neuquen','Rio Negro',
                  'Salta','San Juan','San Luis','Santa Cruz','Santa Fe','Santiago del Estero',
                  'Tierra del Fuego','Tucuman')
df_2008 <- data.frame()

for (n_p in 1:23) {
  
  file_path <- paste0("../../data/size/raw/cna2008/", province_list[n_p], ".xls")
  df <- read_excel(file_path, sheet = "clean", .name_repair = "unique_quiet")
  
  df <- df %>% dplyr::select(-3, -4, -5)
  df <- df %>%  mutate(across(everything(), ~replace(., . %in% c("-", "s"), 0))) %>%
    mutate(year = "2008",state = province_name[n_p]) %>%
    relocate(year, state) %>% rename(county = Partido, variable = Variable)
  df <- df %>% mutate(across(c(county, state), ~ stri_trans_general(as.character(.), "Latin-ASCII"))) %>%
    mutate(across(c(county, state), ~ toupper(str_trim(.))))
  df <- df %>% mutate(across(-c(year, state, county, variable), as.character))
  df1 <- df %>% filter(is.na(county))
  df2 <- df %>% filter(!is.na(county))
  df1$county <- df2$county
  df_12 <- bind_rows(df1, df2)
  df_melted <- df_12 %>%  pivot_longer(cols = -c(year, state, county, variable),names_to = "size_bin",values_to = "value")
  
  if (n_p == 1) {
    df_2008 <- df_melted
  } else {
    df_2008 <- bind_rows(df_2008, df_melted)
  }
}

df <- df_2008 %>% mutate(county = recode(county,
                                         '1 DE MAYO' = 'PRIMERO DE MAYO',
                                         '1° DE MAYO'='PRIMERO DE MAYO',
                                         '12 DE OCTUBRE' = 'DOCE DE OCTUBRE',
                                         '2 DE ABRIL' = 'DOS DE ABRIL',
                                         '25 DE MAYO' = 'VEINTICINCO DE MAYO',
                                         '9 DE JULIO' = 'NUEVE DE JULIO',
                                         'ADOLFO GONZALES CHAVES' = 'GONZALES CHAVES',
                                         'CORONEL DE MARINA L ROSALES' = 'CORONEL ROSALES',
                                         'CORONEL DE MARINA LEONARDO ROSALES' = 'CORONEL ROSALES',
                                         'CONCEPCION DE LA SIERRA' = 'CONCEPCION',
                                         'FLORENTINO AMEGHINO' = 'AMEGHINO',
                                         'GENERAL JUAN MADARIAGA' = 'GENERAL MADARIAGA',
                                         'LDOR GRAL SAN MARTIN' = 'LIBERTADOR GENERAL SAN MARTIN',
                                         'LEANDRO N ALEM' = 'LEANDRO N. ALEM',
                                         'GRAL BELGRANO' = 'GENERAL MANUEL BELGRANO',
                                         'LA CANDELARIA' = 'CANDELARIA',
                                         'DR MANUEL BELGRANO' = 'CAPITAL',
                                         'LA CAPITAL' = 'CAPITAL'))

df_1 <- df
df_2 <- read_csv('../../data/landuse/clean/geographicunits_argentina.csv.gz',show_col_types = FALSE)
df <- df_2 %>% inner_join(df_1, by = c("county", "state")) %>%
  dplyr::select(year, county_id, county, state_id, state, variable, size_bin, value)

#Standardize size bins
old_bins <- c('Hasta  5', 'Hasta 5',
              '5,1 - 10', '5,1 a 10',
              '10,1 - 25','10,1 a 25', 'Hasta 25', 
              '25,1 - 50', '25,1 a 50',
              '50,1 - 100', '50,1 - 75', '50,1 a 100','75,1 - 100',
              '100,1 - 150', '100,1 - 200','100,1 a 200','150,1 - 200',
              '200,1 - 300', '200,1 - 500', '200,1 a 500', '300,1 - 500',
              '500,1 - 1.000', '500,1 a 1.000','Hasta 1000',
              '1.000,1 - 1.500', '1.000,1 - 2.500', '1.000,1 - 5.000',
              '1.500 - 2.500', '1.500,1 - 2.000', '1.500,1 - 2.500', 
              '2.000,1 - 2.500',
              '2.500,1 - 3.500','2.500,1 - 5.000','3.500,1 - 5.000',
              '5.000,1 - 10.00', '5.000,1 - 10.000','5.000,1 - 7.500','7.500,1 - 10.000','7500,1 - 10.000', 
              '10.000,1 - 20.000',
              'Más de 1.000',
              'Más de 5.000',
              'Más de 10.000',
              'Más de 20.000','Mas de 20.000')

new_bins <- c('0_5_ha','0_5_ha',
              '5_10_ha','5_10_ha',
              '10_25_ha','10_25_ha','10_25_ha',
              '25_50_ha','25_50_ha',
              '50_100_ha','50_100_ha','50_100_ha','50_100_ha',
              '100_200_ha','100_200_ha','100_200_ha','100_200_ha',
              '200_500_ha','200_500_ha','200_500_ha','200_500_ha',
              '500_1000_ha','500_1000_ha','500_1000_ha',
              '1000_5000_ha','1000_5000_ha','1000_5000_ha',
              '1000_5000_ha','1000_5000_ha','1000_5000_ha',
              '1000_5000_ha',
              '1000_5000_ha','1000_5000_ha','1000_5000_ha',
              '5000_10000_ha','5000_10000_ha','5000_10000_ha','5000_10000_ha','5000_10000_ha',
              '10000_20000_ha',
              '1000_5000_ha',
              '5000_10000_ha',
              '10000_20000_ha',
              '20000_ha','20000_ha')

df <- df %>% mutate(size_bin = recode(size_bin, !!!setNames(new_bins, old_bins)),
                    variable = recode(variable, "ha" = "area_ha", "EAP" = "establishments") )
df_2008_clean <- df


#################### 2018

province_list <- c("buenosaires", "catamarca", "chaco", "chubut", "cordoba", "corrientes", "entrerios",
                   "formosa", "jujuy", "lapampa", "larioja", "mendoza", "misiones", "neuquen", "rionegro",
                   "salta", "sanjuan", "sanluis", "santacruz", "santafe", "santiagodelestero",
                   "tierradelfuego", "tucuman")
province_name <- c("Buenos Aires", "Catamarca", "Chaco", "Chubut", "Cordoba", "Corrientes", "Entre Rios",
                   "Formosa", "Jujuy", "La Pampa", "La Rioja", "Mendoza", "Misiones", "Neuquen", "Rio Negro",
                   "Salta", "San Juan", "San Luis", "Santa Cruz", "Santa Fe", "Santiago del Estero","Tierra del Fuego", "Tucuman")
df_2018 <- data.frame()

for (n_p in seq_along(province_list)) {
  df <- read_excel('../../data/size/raw/cna2018/all.xlsx', sheet = province_list[n_p], .name_repair = "unique_quiet")
  df <- df %>% dplyr::select(-3)
  df <- df %>%mutate(across(everything(), ~ replace(., . %in% c("-", "s"), 0))) %>%
    mutate(year = "2018",state = province_name[n_p]) %>%
    relocate(year, state) %>% rename(county = Partido, variable = Variable) %>%
    mutate(county = county %>% stri_trans_general("Latin-ASCII") %>% str_to_upper() %>%
             str_trim(),state = state %>% stri_trans_general("Latin-ASCII") %>% str_to_upper() %>%str_trim())
  df <- df %>% mutate(across(-c(year, state, county, variable), as.character))
  df1 <- df %>% filter(is.na(county))
  df2 <- df %>% filter(!is.na(county))
  
  if (nrow(df1) > 0) {df1$county <- df2$county}
  df_12 <- bind_rows(df1, df2)
  
  df_melted <- df_12 %>% pivot_longer(cols = -c(year, state, county, variable),names_to = "size_bin",values_to = "value")
  
  if (n_p == 1) {df_2018 <- df_melted} else { df_2018 <- bind_rows(df_2018, df_melted)}
}

df <- df_2018 %>% mutate(county = recode(county,
                                         '1 DE MAYO' = 'PRIMERO DE MAYO',
                                         '1° DE MAYO'='PRIMERO DE MAYO',
                                         '12 DE OCTUBRE' = 'DOCE DE OCTUBRE',
                                         '2 DE ABRIL' = 'DOS DE ABRIL',
                                         '25 DE MAYO' = 'VEINTICINCO DE MAYO',
                                         '9 DE JULIO' = 'NUEVE DE JULIO',
                                         'ADOLFO GONZALES CHAVES' = 'GONZALES CHAVES',
                                         'CORONEL DE MARINA L ROSALES' = 'CORONEL ROSALES',
                                         'CORONEL DE MARINA LEONARDO ROSALES' = 'CORONEL ROSALES',
                                         'CONCEPCION DE LA SIERRA' = 'CONCEPCION',
                                         'FLORENTINO AMEGHINO' = 'AMEGHINO',
                                         'GENERAL JUAN MADARIAGA' = 'GENERAL MADARIAGA',
                                         'LDOR GRAL SAN MARTIN' = 'LIBERTADOR GENERAL SAN MARTIN',
                                         'LEANDRO N ALEM' = 'LEANDRO N. ALEM',
                                         'GRAL BELGRANO' = 'GENERAL MANUEL BELGRANO',
                                         'LA CANDELARIA' = 'CANDELARIA',
                                         'DR MANUEL BELGRANO' = 'CAPITAL',
                                         'LA CAPITAL' = 'CAPITAL'))

df_1 <- df
df_2 <- read_csv('../../data/landuse/clean/geographicunits_argentina.csv.gz',show_col_types = FALSE)
df <- df_2 %>% inner_join(df_1, by = c("county", "state")) %>%
  dplyr::select(year, county_id, county, state_id, state, variable, size_bin, value)

#Standardize size bins
old_bins <- c('Hasta  5', 'Hasta 5',
              '5,1 - 10', '5,1 a 10',
              '10,1 - 25','10,1 a 25', 'Hasta 25', 
              '25,1 - 50', '25,1 a 50',
              '50,1 - 100', '50,1 - 75', '50,1 a 100','75,1 - 100',
              '100,1 - 150', '100,1 - 200','100,1 a 200','150,1 - 200',
              '200,1 - 300', '200,1 - 500', '200,1 a 500', '300,1 - 500',
              '500,1 - 1.000', '500,1 a 1.000','Hasta 1000',
              '1.000,1 - 1.500', '1.000,1 - 2.500', '1.000,1 - 5.000',
              '1.500 - 2.500', '1.500,1 - 2.000', '1.500,1 - 2.500', 
              '2.000,1 - 2.500',
              '2.500,1 - 3.500','2.500,1 - 5.000','3.500,1 - 5.000',
              '5.000,1 - 10.00', '5.000,1 - 10.000','5.000,1 - 7.500','7.500,1 - 10.000','7500,1 - 10.000', 
              '10.000,1 - 20.000',
              'Más de 1.000',
              'Más de 5.000',
              'Más de 10.000',
              'Más de 20.000','Mas de 20.000')

new_bins <- c('0_5_ha','0_5_ha',
              '5_10_ha','5_10_ha',
              '10_25_ha','10_25_ha','10_25_ha',
              '25_50_ha','25_50_ha',
              '50_100_ha','50_100_ha','50_100_ha','50_100_ha',
              '100_200_ha','100_200_ha','100_200_ha','100_200_ha',
              '200_500_ha','200_500_ha','200_500_ha','200_500_ha',
              '500_1000_ha','500_1000_ha','500_1000_ha',
              '1000_5000_ha','1000_5000_ha','1000_5000_ha',
              '1000_5000_ha','1000_5000_ha','1000_5000_ha',
              '1000_5000_ha',
              '1000_5000_ha','1000_5000_ha','1000_5000_ha',
              '5000_10000_ha','5000_10000_ha','5000_10000_ha','5000_10000_ha','5000_10000_ha',
              '10000_20000_ha',
              '1000_5000_ha',
              '5000_10000_ha',
              '10000_20000_ha',
              '20000_ha','20000_ha')

df <- df %>% mutate(size_bin = recode(size_bin, !!!setNames(new_bins, old_bins)),
                     variable = recode(variable, "Hectáreas" = "area_ha", "EAP" = "establishments"))
df_2018_clean <- df
df <- bind_rows(df_2008_clean, df_2018_clean) %>% mutate(across(where(is.numeric), ~ replace_na(., 0)))
write_csv(df, "../../data/size/clean/argentina_size_distribution_land.csv.gz")







